HDBSCAN¶

HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) is a density-based clustering algorithm that improves upon DBSCAN by allowing the detection of clusters with varying densities. It is particularly effective for discovering clusters in datasets with complex structures or noise.

Key Characteristics of HDBSCAN:¶

  • Automatic Cluster Detection: Unlike K-Means, HDBSCAN automatically determines the number of clusters based on data density.
  • Noise Handling: HDBSCAN can detect outliers and noise, assigning them to a special label (-1 by default).
  • Hierarchical Clustering: The algorithm starts by creating a hierarchy of clusters and condenses it into a flat partition.
  • Parameters:
    • min_cluster_size: Defines the minimum size of clusters.
    • min_samples: Controls how conservative the clustering is.
  • Distance Metrics: HDBSCAN supports various distance metrics — Euclidean and Manhattan are demonstrated below — and can also accept custom or precomputed metrics (e.g., cosine distance).

Below I explore using HDBSCAN on word embedding vectors to see how well it can cluster similar terms.

In [2]:
from numpy import dot
from numpy.linalg import norm
from openai import AzureOpenAI
from openai import OpenAI
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import hdbscan
from hdbscan import BranchDetector
import os
import plotly.io as pio
import plotly.offline as pyo

# Load environment variables from a local .env file; override=True means
# values in .env win over anything already set in the process environment.
import dotenv
dotenv.load_dotenv(override=True)

#nlp = spacy.load('en_core_web_sm')
# Credentials for the Azure OpenAI resource; both must be present in .env:
#   OPEN_AI_API_KEY - API key for the resource
#   AZURE_ENDPOINT  - base URL of the deployed resource
open_ai_api_key = os.getenv("OPEN_AI_API_KEY")
endpoint = os.getenv("AZURE_ENDPOINT")

# Module-level client used by generate_embeddings() below.
client = AzureOpenAI(
  api_key = open_ai_api_key,  
  api_version = '2023-07-01-preview',
  azure_endpoint = endpoint
)

def generate_embeddings(text, model="text-embedding-ada-002"):
    """Return the embedding vector for `text` via the Azure OpenAI client.

    Parameters:
        text: string to embed.
        model: embedding model/deployment name.

    Returns:
        list[float] embedding on success, or None if the API call fails.
    """
    try:
        return client.embeddings.create(input=[text], model=model).data[0].embedding
    except Exception as exc:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still
        # propagate, and the failure is surfaced instead of silently hidden.
        print(f"Embedding request failed for {text!r}: {exc}")
        return None

# Function to calculate cosine similarity between two equal-length vectors.
def calculate_cosine_similarity(vec1, vec2):
    """Return cosine similarity: dot(vec1, vec2) / (|vec1| * |vec2|).

    Fix: the original body called `cosine_similarity`, which is never
    imported anywhere in this notebook (NameError at call time). The file
    already imports numpy's `dot` and `norm` at the top, clearly intended
    for exactly this computation.
    """
    return dot(vec1, vec2) / (norm(vec1) * norm(vec2))
In [3]:
# Ground-truth class per term, used later to score clustering quality.
term_to_class = {
    # 0: terrestrial animals
    'cat': 0, 'dog': 0, 'deer': 0, 'lion': 0, 'wolf': 0, 'bear': 0,
    # 1: cephalopods
    'octopus': 1, 'squid': 1,
    # 2: reptiles
    'alligator': 2, 'crocodile': 2, 'lizard': 2, 'iguana': 2, 'gecko': 2,
    # 3: aquatic animals
    'tuna': 3, 'rockfish': 3, 'salmon': 3, 'goldfish': 3,
    'swordfish': 3, 'herring': 3, 'dolphin': 3,
}

# Dicts preserve insertion order, so this matches the grouping above.
terms = list(term_to_class)

# One embedding per term, fetched through the OpenAI client.
evs = {term: generate_embeddings(term) for term in terms}
In [4]:
# Quick sanity check: one embedding entry should exist per term.
evs.keys()
Out[4]:
dict_keys(['cat', 'dog', 'deer', 'lion', 'wolf', 'bear', 'octopus', 'squid', 'alligator', 'crocodile', 'lizard', 'iguana', 'gecko', 'tuna', 'rockfish', 'salmon', 'goldfish', 'swordfish', 'herring', 'dolphin'])
In [5]:
# Embedding vectors in the same order as `terms`, ready for clustering.
evs_list = list(map(evs.__getitem__, terms))
In [6]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

def hdbscan_cluster_plot(evs, terms, term_to_class_dict, hdbscan_dist_metric, alpha=.001, leaf_size=1, min_cluster_size=2, min_samples=1):
    """Cluster embeddings with HDBSCAN and plot them on a t-SNE projection.

    Parameters:
        evs: list of embedding vectors, one per entry in `terms` (same order).
        terms: list of term strings used as point labels.
        term_to_class_dict: mapping term -> ground-truth class id; terms
            missing from the dict are scored as noise (-1).
        hdbscan_dist_metric: metric name forwarded to hdbscan.HDBSCAN.
        alpha, leaf_size, min_cluster_size, min_samples: HDBSCAN tuning knobs.

    Returns:
        (fig, term_classes, ari, nmi): the Plotly figure, a list of
        (cluster_label, term) pairs, and the ARI / NMI scores of the
        clustering against the ground-truth labels.
    """
    embeddings = np.asarray(evs)

    # t-SNE is for 2-D display only; HDBSCAN clusters the full-dim vectors.
    tsne_model = TSNE(n_components=2, random_state=42, perplexity=1,
                      learning_rate=.001, max_iter=10000)
    reduced_embeddings = tsne_model.fit_transform(embeddings)

    # Initialize HDBSCAN clusterer
    clusterer = hdbscan.HDBSCAN(algorithm='best',
                                alpha=alpha,
                                gen_min_span_tree=True,
                                cluster_selection_epsilon=.1,
                                leaf_size=leaf_size,
                                metric=hdbscan_dist_metric,
                                min_cluster_size=min_cluster_size,
                                min_samples=min_samples,
                                branch_detection_data=True,
                                p=None)

    # Fit the model and get one cluster label per input vector.
    cluster_labels = clusterer.fit_predict(embeddings)

    # Score against ground truth; unknown terms default to the noise label.
    true_labels = [term_to_class_dict.get(term, -1) for term in terms]
    ari = adjusted_rand_score(true_labels, cluster_labels)
    nmi = normalized_mutual_info_score(true_labels, cluster_labels)

    # Seeded RNG makes the cluster colors reproducible across re-runs
    # (previously unseeded np.random, so colors changed on every call).
    rng = np.random.default_rng(42)

    fig = go.Figure()
    term_classes = []

    # Sorted labels give a stable trace/legend order (raw set order is not).
    for label in sorted(set(cluster_labels)):
        # Boolean mask replaces the original per-index list comprehensions.
        mask = cluster_labels == label
        cluster_points = reduced_embeddings[mask]
        cluster_terms = [term for term, hit in zip(terms, mask) if hit]
        term_classes.extend((label, term) for term in cluster_terms)

        # Noise (-1) is always grey; real clusters get a random RGB color.
        marker_color = ('grey' if label == -1 else
                        f'rgb({rng.integers(0, 255)}, {rng.integers(0, 255)}, {rng.integers(0, 255)})')

        # Add cluster trace to the figure
        fig.add_trace(go.Scatter(
            x=cluster_points[:, 0],
            y=cluster_points[:, 1],
            mode='markers+text',
            name=f'Cluster {label}' if label != -1 else 'Noise',
            marker=dict(color=marker_color, size=10, line=dict(width=1)),
            text=cluster_terms,  # Show terms for each point
            textposition='top center'  # Adjust text position
        ))

    # Set the layout for the plot
    fig.update_layout(
        title=f"HDBSCAN Clusters on t-SNE with ARI: {ari:.2f}, NMI: {nmi:.2f}",
        xaxis_title="t-SNE Dim 1",
        yaxis_title="t-SNE Dim 2",
        legend_title="Clusters",
        width=900,
        height=800
    )
    return fig, term_classes, ari, nmi

Manhattan Distance¶

In [8]:
# Experiment 1: Manhattan (L1) distance with the default HDBSCAN knobs.
fig, term_classes, ari, nmi = hdbscan_cluster_plot(
    evs_list, terms, term_to_class, 'manhattan')
fig.show()

Euclidean Distance¶

In [10]:
# Experiment 2: Euclidean (L2) distance, all other knobs at defaults.
fig, term_classes, ari, nmi = hdbscan_cluster_plot(
    evs_list, terms, term_to_class, 'euclidean')
fig.show()

Increase in Alpha¶

In [12]:
# Experiment 3: Euclidean distance with a larger alpha (more conservative
# cluster merging in the condensed tree).
fig, term_classes, ari, nmi = hdbscan_cluster_plot(
    evs_list, terms, term_to_class, 'euclidean', alpha=.1)
fig.show()

Increase in Min. Cluster Size¶

In [14]:
# Experiment 4: Euclidean distance with stricter density requirements
# (larger min_cluster_size and min_samples).
fig, term_classes, ari, nmi = hdbscan_cluster_plot(
    evs_list, terms, term_to_class, 'euclidean',
    alpha=.1, leaf_size=1, min_cluster_size=5, min_samples=10)
fig.show()